### lets import all the necessary packages !
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
## load the Jan-June 2015 raw trip records into a DataFrame
DATA_PATH = r'C:\Users\manje\Downloads\Projects\uber/uber-raw-data-janjune-15.csv'
uber_15 = pd.read_csv(DATA_PATH, encoding='utf-8')
## quick peek at the first two rows
uber_15.head(2)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 |
## dimensions of the data as (rows, columns)
uber_15.shape
(14270479, 4)
## count rows that are exact duplicates of an earlier row (all columns identical)
uber_15.duplicated().sum()
898225
## remove the duplicated rows in place (keep='first' by default, so the
## first occurrence of each duplicate group is retained)
uber_15.drop_duplicates(inplace=True)
uber_15.shape
(13372254, 4)
### inspect the data type of each feature
uber_15.dtypes
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
We can see that "Pickup_date" has an object (string) data type. We therefore convert it to a datetime dtype, because later we need to extract several derived attributes from it. Pandas `to_datetime` performs this conversion from object dtype to datetime dtype.
## convert to datetime; passing an explicit format avoids per-row format
## inference, which matters on a frame of this size (~13M rows)
uber_15['Pickup_date']=pd.to_datetime(uber_15['Pickup_date'], format ='%Y-%m-%d %H:%M:%S' )
uber_15['Pickup_date'].dtype
dtype('<M8[ns]')
## extract month number (1-6 for Jan-June) from 'Pickup_date'..
uber_15['month']=uber_15['Pickup_date'].dt.month
## bar chart of ride counts per month
uber_15['month'].value_counts().plot(kind='bar')
<Axes: >
## extract derived features (weekday name, day of month, hour, minute)
## from 'Pickup_date'..
## NOTE: 'month' was already derived in the previous cell, so it is not
## recomputed here (the original recomputed dt.month over the full frame)
uber_15['weekday']=uber_15['Pickup_date'].dt.day_name()
uber_15['day']=uber_15['Pickup_date'].dt.day
uber_15['hour']=uber_15['Pickup_date'].dt.hour
uber_15['minute']=uber_15['Pickup_date'].dt.minute
uber_15.head(2)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | weekday | day | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 | 5 | Sunday | 17 | 9 | 47 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 | 5 | Sunday | 17 | 9 | 47 |
## ride counts per (month, weekday) pair; as_index=False keeps the keys as
## ordinary columns instead of a MultiIndex
temp=uber_15.groupby(['month','weekday'],as_index=False).size()
temp.head()
| month | weekday | size | |
|---|---|---|---|
| 0 | 1 | Friday | 339285 |
| 1 | 1 | Monday | 190606 |
| 2 | 1 | Saturday | 386049 |
| 3 | 1 | Sunday | 230487 |
| 4 | 1 | Thursday | 330319 |
temp['month'].unique()
array([1, 2, 3, 4, 5, 6], dtype=int64)
## map month numbers to readable labels for the chart axis
## (fixed 'april' -> 'April' so the labels are consistently capitalized)
dict_month={1:'Jan', 2:'Feb', 3:'March', 4:'April', 5:'May', 6:'June'}
temp['month']=temp['month'].map(dict_month)
temp['month']
0 Jan 1 Jan 2 Jan 3 Jan 4 Jan 5 Jan 6 Jan 7 Feb 8 Feb 9 Feb 10 Feb 11 Feb 12 Feb 13 Feb 14 March 15 March 16 March 17 March 18 March 19 March 20 March 21 april 22 april 23 april 24 april 25 april 26 april 27 april 28 May 29 May 30 May 31 May 32 May 33 May 34 May 35 June 36 June 37 June 38 June 39 June 40 June 41 June Name: month, dtype: object
type(uber_15.groupby(['month','weekday']).size())
pandas.core.series.Series
temp
| month | weekday | size | |
|---|---|---|---|
| 0 | Jan | Friday | 339285 |
| 1 | Jan | Monday | 190606 |
| 2 | Jan | Saturday | 386049 |
| 3 | Jan | Sunday | 230487 |
| 4 | Jan | Thursday | 330319 |
| 5 | Jan | Tuesday | 196574 |
| 6 | Jan | Wednesday | 245650 |
| 7 | Feb | Friday | 373550 |
| 8 | Feb | Monday | 274948 |
| 9 | Feb | Saturday | 368311 |
| 10 | Feb | Sunday | 296130 |
| 11 | Feb | Thursday | 335603 |
| 12 | Feb | Tuesday | 287260 |
| 13 | Feb | Wednesday | 286387 |
| 14 | March | Friday | 309631 |
| 15 | March | Monday | 269931 |
| 16 | March | Saturday | 314785 |
| 17 | March | Sunday | 313865 |
| 18 | March | Thursday | 277026 |
| 19 | March | Tuesday | 320634 |
| 20 | March | Wednesday | 256767 |
| 21 | april | Friday | 315002 |
| 22 | april | Monday | 238429 |
| 23 | april | Saturday | 324545 |
| 24 | april | Sunday | 273560 |
| 25 | april | Thursday | 372522 |
| 26 | april | Tuesday | 250632 |
| 27 | april | Wednesday | 338015 |
| 28 | May | Friday | 430134 |
| 29 | May | Monday | 255501 |
| 30 | May | Saturday | 464298 |
| 31 | May | Sunday | 390391 |
| 32 | May | Thursday | 337607 |
| 33 | May | Tuesday | 290004 |
| 34 | May | Wednesday | 316045 |
| 35 | June | Friday | 371225 |
| 36 | June | Monday | 375312 |
| 37 | June | Saturday | 399377 |
| 38 | June | Sunday | 334434 |
| 39 | June | Thursday | 357782 |
| 40 | June | Tuesday | 405500 |
| 41 | June | Wednesday | 328141 |
## grouped bar chart: ride volume per month, one bar per weekday
fig = plt.figure(figsize=(12, 8))
ax = sns.barplot(data=temp, x='month', y='size', hue='weekday')
ax
<Axes: xlabel='month', ylabel='size'>
## ride counts for every (weekday, hour) combination -- 7 x 24 = 168 rows
summary=uber_15.groupby(['weekday','hour'],as_index=False).size()
summary
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 79879 |
| 1 | Friday | 1 | 44563 |
| 2 | Friday | 2 | 27252 |
| 3 | Friday | 3 | 19076 |
| 4 | Friday | 4 | 23049 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 131317 |
| 164 | Wednesday | 20 | 123490 |
| 165 | Wednesday | 21 | 120941 |
| 166 | Wednesday | 22 | 115208 |
| 167 | Wednesday | 23 | 91631 |
168 rows × 3 columns
## hour-of-day demand curves, one line per weekday
figure = plt.figure(figsize=(12, 8))
axes = sns.pointplot(data=summary, x='hour', y='size', hue='weekday')
axes
<Axes: xlabel='hour', ylabel='size'>
## analysis notes -- the string below is the cell's value, echoed as output
'''
It's interesting to see that Saturday and Sunday exhibit similar demand throughout the late night/morning/afternoon,
but it exhibits opposite trends during the evening. In the evening, Saturday pickups continue to increase throughout the evening,
but Sunday pickups takes a downward turn after evening..
We can see that there the weekdays that has the most demand during the late evening is Friday and Saturday,
which is expected, but what strikes me is that Thursday nights also exhibits very similar trends as Friday and Saturday nights.
It seems like New Yorkers are starting their 'weekends' on Thursday nights. :)
'''
"\nIt's interesting to see that Saturday and Sunday exhibit similar demand throughout the late night/morning/afternoon, \nbut it exhibits opposite trends during the evening. In the evening, Saturday pickups continue to increase throughout the evening,\nbut Sunday pickups takes a downward turn after evening..\n\nWe can see that there the weekdays that has the most demand during the late evening is Friday and Saturday, \nwhich is expected, but what strikes me is that Thursday nights also exhibits very similar trends as Friday and Saturday nights.\n\nIt seems like New Yorkers are starting their 'weekends' on Thursday nights. :)\n\n\n"
uber_15.head(2)
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | weekday | day | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-17 09:47:00 | B02617 | 141 | 5 | Sunday | 17 | 9 | 47 |
| 1 | B02617 | 2015-05-17 09:47:00 | B02617 | 65 | 5 | Sunday | 17 | 9 | 47 |
## read "Uber-Jan-Feb-FOIL.csv": per-base daily active vehicles and trip counts
uber_foil=pd.read_csv(r'C:\Users\manje\Downloads\Projects\uber/Uber-Jan-Feb-FOIL.csv')
uber_foil.head()
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
##!pip install chart_studio
##!pip install plotly
### establishing the entire set-up of Plotly..
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs ,plot ,iplot ,init_notebook_mode
init_notebook_mode(connected=True)
# NOTE(review): only px and init_notebook_mode appear to be used in this file --
# confirm the remaining names are unused elsewhere before pruning these imports.
## box plot: distribution of active vehicles for each dispatching base
px.box(x='dispatching_base_number',y='active_vehicles' ,data_frame=uber_foil)
### violin plot: shows the distribution shape plus the five-number summary
px.violin(x='dispatching_base_number',y='active_vehicles' ,data_frame=uber_foil)
import os
## select the raw-data CSVs by NAME rather than by position in the directory
## listing -- os.listdir order is platform-dependent and the original [-7:]
## slice silently breaks if other files are added; sorted() makes the result
## deterministic. (The janjune-15 file is still included here and removed below.)
files = sorted(f for f in os.listdir(r'C:\Users\manje\Downloads\Projects\uber')
               if f.startswith('uber-raw-data') and f.endswith('.csv'))
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
## the Jan-June 2015 file was already processed above, so exclude it here
files.remove('uber-raw-data-janjune-15.csv')
files
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
path=r'C:\Users\manje\Downloads\Projects\uber'
## read every monthly 2014 file once, then concatenate in a SINGLE call --
## calling pd.concat inside the loop recopies the accumulated frame on each
## iteration, which is quadratic in total rows
frames=[pd.read_csv(path+'/'+file,encoding='utf-8') for file in files]
## the original loop prepended each new frame; reverse to keep row order identical
final=pd.concat(frames[::-1])
### After collecting the entire data, the natural question is: do we have duplicate entries?
### We only treat a row as a duplicate when the ENTIRE row is duplicated
final.shape
(4534327, 4)
final.head(2)
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
### count observations that are exact duplicates of an earlier row
final.duplicated().sum()
82581
## drop duplicate rows..
### By default, duplicates are judged on all columns; pass the subset
### parameter of 'drop_duplicates()' to judge on specific column(s) instead.
### keep='first' (the default) retains the first occurrence of each duplicate.
final.drop_duplicates(inplace=True)
final.shape
(4451746, 4)
The Base codes are for the following Uber bases:
B02512 : Unter
B02598 : Hinter
B02617 : Weiter
B02682 : Schmecken
B02764 : Danach-NY
->> The globe is divided into 360 imaginary sections running from top to bottom (north to south) and 180 sections running from side to side (west to east). The lines running from pole to pole on a globe are lines of longitude, and the lines running from side to side are lines of latitude.
->> Latitude is the measurement of distance north or south of the Equator.
->> Every location on earth has a global address. Because the address is in numbers, people can communicate about location no matter what language they might speak. A global address is given as two numbers called coordinates. The two numbers are a location's latitude number and its longitude number ("Lat/Long").
### more data points at a coordinate means heavier demand ("rush") there
## pickup count at every distinct (Lat, Lon) coordinate pair
rush_uber=final.groupby(['Lat','Lon'],as_index=False).size()
rush_uber
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
| 3 | 39.8416 | -74.1512 | 1 |
| 4 | 39.9055 | -74.0791 | 1 |
| ... | ... | ... | ... |
| 574553 | 41.3730 | -72.9237 | 1 |
| 574554 | 41.3737 | -73.7988 | 1 |
| 574555 | 41.5016 | -72.8987 | 1 |
| 574556 | 41.5276 | -72.7734 | 1 |
| 574557 | 42.1166 | -72.0666 | 1 |
574558 rows × 3 columns
#!pip install folium
import folium
## empty world map; the heat layer is attached to it below
basemap=folium.Map()
from folium.plugins import HeatMap
## HeatMap expects rows of [lat, lon, weight]; rush_uber's column order
## (Lat, Lon, size) matches that layout
HeatMap(rush_uber).add_to(basemap)
<folium.plugins.heat_map.HeatMap at 0x175beb87ee0>
basemap